--- /dev/null
+/* -*- Mode:C; c-basic-offset:4; tab-width:4; indent-tabs-mode:nil -*- */
+/******************************************************************************
+ * arch/x86/mm.c
+ *
+ * Copyright (c) 2002-2005 K A Fraser
+ * Copyright (c) 2004 Christian Limpach
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+/*
+ * A description of the x86 page table API:
+ *
+ * Domains trap to do_mmu_update with a list of update requests.
+ * This is a list of (ptr, val) pairs, where the requested operation
+ * is *ptr = val.
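+ *
+ * For example (an illustrative guest-side sketch; 'pte_maddr' and
+ * 'new_pte_val' are hypothetical placeholders, and HYPERVISOR_mmu_update
+ * is the usual guest hypercall stub):
+ *
+ *     mmu_update_t req;
+ *     req.ptr = pte_maddr | MMU_NORMAL_PT_UPDATE; /* machine addr of PTE */
+ *     req.val = new_pte_val;                      /* value to write      */
+ *     (void)HYPERVISOR_mmu_update(&req, 1, NULL);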
+ *
+ * Reference counting of pages:
+ * ----------------------------
+ * Each page has two refcounts: tot_count and type_count.
+ *
+ * TOT_COUNT is the obvious reference count. It counts all uses of a
+ * physical page frame by a domain, including uses as a page directory,
+ * a page table, or simple mappings via a PTE. This count prevents a
+ * domain from releasing a frame back to the free pool when it still holds
+ * a reference to it.
+ *
+ * TYPE_COUNT is more subtle. A frame can be put to one of three
+ * mutually-exclusive uses: it might be used as a page directory, or a
+ * page table, or it may be mapped writable by the domain [of course, a
+ * frame may not be used in any of these three ways!].
+ * So, type_count is a count of the number of times a frame is being
+ * referred to in its current incarnation. Therefore, a page can only
+ * change its type when its type count is zero.
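+ *
+ * For example, a frame with type_count == 2 as an L1 page table cannot be
+ * mapped writable (PGT_writable_page) until both page-table references to
+ * it have been dropped.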
+ *
+ * Pinning the page type:
+ * ----------------------
+ * The type of a page can be pinned/unpinned with the commands
+ * MMUEXT_[UN]PIN_L?_TABLE. Each page can be pinned exactly once (that is,
+ * pinning is not reference counted, so it can't be nested).
+ * This is useful to prevent a page's type count falling to zero, at which
+ * point safety checks would need to be carried out next time the count
+ * is increased again.
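+ *
+ * E.g. (illustrative sketch; 'l2_mfn' is a hypothetical placeholder --
+ * the encoding matches what do_extended_command() below expects):
+ *
+ *     mmu_update_t req;
+ *     req.ptr = (l2_mfn << PAGE_SHIFT) | MMU_EXTENDED_COMMAND;
+ *     req.val = MMUEXT_PIN_L2_TABLE;
+ *     (void)HYPERVISOR_mmu_update(&req, 1, NULL);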
+ *
+ * A further note on writable page mappings:
+ * -----------------------------------------
+ * For simplicity, the count of writable mappings for a page may not
+ * correspond to reality. The 'writable count' is incremented for every
+ * PTE which maps the page with the _PAGE_RW flag set. However, for
+ * write access to be possible the page directory entry must also have
+ * its _PAGE_RW bit set. We do not check this as it complicates the
+ * reference counting considerably [consider the case of multiple
+ * directory entries referencing a single page table, some with the RW
+ * bit set, others not -- it starts getting a bit messy].
+ * In normal use, this simplification shouldn't be a problem.
+ * However, the logic can be added if required.
+ *
+ * One more note on read-only page mappings:
+ * -----------------------------------------
+ * We want domains to be able to map pages for read-only access. The
+ * main reason is that page tables and directories should be readable
+ * by a domain, but it would not be safe for them to be writable.
+ * However, domains have free access to rings 1 & 2 of the Intel
+ * privilege model. In terms of page protection, these are considered
+ * to be part of 'supervisor mode'. The WP bit in CR0 controls whether
+ * read-only restrictions are respected in supervisor mode -- if the
+ * bit is clear then any mapped page is writable.
+ *
+ * We get round this by always setting the WP bit and disallowing
+ * updates to it. This is very unlikely to cause a problem for guest
+ * OS's, which will generally use the WP bit to simplify copy-on-write
+ * implementation (in that case, OS wants a fault when it writes to
+ * an application-supplied buffer).
+ */
+
+#include <xen/config.h>
+#include <xen/init.h>
+#include <xen/kernel.h>
+#include <xen/lib.h>
+#include <xen/mm.h>
+#include <xen/sched.h>
+#include <xen/errno.h>
+#include <xen/perfc.h>
+#include <xen/irq.h>
+#include <xen/softirq.h>
+#include <asm/shadow.h>
+#include <asm/page.h>
+#include <asm/flushtlb.h>
+#include <asm/io.h>
+#include <asm/uaccess.h>
+#include <asm/domain_page.h>
+#include <asm/ldt.h>
+
+#ifdef VERBOSE
+#define MEM_LOG(_f, _a...) \
+ printk("DOM%u: (file=memory.c, line=%d) " _f "\n", \
+ current->domain->id , __LINE__ , ## _a )
+#else
+#define MEM_LOG(_f, _a...) ((void)0)
+#endif
+
+static int alloc_l2_table(struct pfn_info *page);
+static int alloc_l1_table(struct pfn_info *page);
+static int get_page_from_pagenr(unsigned long page_nr, struct domain *d);
+static int get_page_and_type_from_pagenr(unsigned long page_nr,
+ u32 type,
+ struct domain *d);
+
+static void free_l2_table(struct pfn_info *page);
+static void free_l1_table(struct pfn_info *page);
+
+static int mod_l2_entry(l2_pgentry_t *, l2_pgentry_t, unsigned long);
+static int mod_l1_entry(l1_pgentry_t *, l1_pgentry_t);
+
+/* Used to defer flushing of memory structures. */
+static struct {
+#define DOP_FLUSH_TLB (1<<0) /* Flush the TLB. */
+#define DOP_RELOAD_LDT (1<<1) /* Reload the LDT shadow mapping. */
+ unsigned long deferred_ops;
+ /* If non-NULL, specifies a foreign subject domain for some operations. */
+ struct domain *foreign;
+} __cacheline_aligned percpu_info[NR_CPUS];
+
+/*
+ * Returns the current foreign domain; defaults to the currently-executing
+ * domain if a foreign override hasn't been specified.
+ */
+#define FOREIGNDOM (percpu_info[smp_processor_id()].foreign ? : current->domain)
+
+/* Private domain structs for DOMID_XEN and DOMID_IO. */
+static struct domain *dom_xen, *dom_io;
+
+/* Frame table and its size in pages. */
+struct pfn_info *frame_table;
+unsigned long frame_table_size;
+unsigned long max_page;
+
+void __init init_frametable(void)
+{
+ unsigned long i, p;
+
+ frame_table = (struct pfn_info *)FRAMETABLE_VIRT_START;
+ frame_table_size = max_page * sizeof(struct pfn_info);
+ frame_table_size = (frame_table_size + PAGE_SIZE - 1) & PAGE_MASK;
+
+ for ( i = 0; i < frame_table_size; i += (4UL << 20) )
+ {
+ p = alloc_boot_pages(min(frame_table_size - i, 4UL << 20), 4UL << 20);
+ if ( p == 0 )
+ panic("Not enough memory for frame table\n");
+ map_pages(idle_pg_table, FRAMETABLE_VIRT_START + i, p,
+ 4UL << 20, PAGE_HYPERVISOR);
+ }
+
+ memset(frame_table, 0, frame_table_size);
+}
+
+void arch_init_memory(void)
+{
+ extern void subarch_init_memory(struct domain *);
+
+ memset(percpu_info, 0, sizeof(percpu_info));
+
+ /*
+ * Initialise our DOMID_XEN domain.
+ * Any Xen-heap pages that we will allow to be mapped will have
+ * their domain field set to dom_xen.
+ */
+ dom_xen = alloc_domain_struct();
+ atomic_set(&dom_xen->refcnt, 1);
+ dom_xen->id = DOMID_XEN;
+
+ /*
+ * Initialise our DOMID_IO domain.
+     * This domain owns no pages but is considered a special case when
+     * mapping I/O pages: such mappings are made at the privilege level
+     * of the caller.
+ */
+ dom_io = alloc_domain_struct();
+ atomic_set(&dom_io->refcnt, 1);
+ dom_io->id = DOMID_IO;
+
+ subarch_init_memory(dom_xen);
+}
+
+void write_ptbase(struct exec_domain *ed)
+{
+ struct domain *d = ed->domain;
+ unsigned long pa;
+
+#ifdef CONFIG_VMX
+ if ( unlikely(shadow_mode(d)) )
+ pa = ((shadow_mode(d) == SHM_full_32) ?
+ pagetable_val(ed->arch.monitor_table) :
+ pagetable_val(ed->arch.shadow_table));
+ else
+ pa = pagetable_val(ed->arch.pagetable);
+#else
+ if ( unlikely(shadow_mode(d)) )
+ pa = pagetable_val(ed->arch.shadow_table);
+ else
+ pa = pagetable_val(ed->arch.pagetable);
+#endif
+
+ write_cr3(pa);
+}
+
+static void __invalidate_shadow_ldt(struct exec_domain *d)
+{
+ int i;
+ unsigned long pfn;
+ struct pfn_info *page;
+
+ d->arch.shadow_ldt_mapcnt = 0;
+
+ for ( i = 16; i < 32; i++ )
+ {
+ pfn = l1_pgentry_to_pfn(d->arch.perdomain_ptes[i]);
+ if ( pfn == 0 ) continue;
+ d->arch.perdomain_ptes[i] = mk_l1_pgentry(0);
+ page = &frame_table[pfn];
+ ASSERT_PAGE_IS_TYPE(page, PGT_ldt_page);
+ ASSERT_PAGE_IS_DOMAIN(page, d->domain);
+ put_page_and_type(page);
+ }
+
+ /* Dispose of the (now possibly invalid) mappings from the TLB. */
+ percpu_info[d->processor].deferred_ops |= DOP_FLUSH_TLB | DOP_RELOAD_LDT;
+}
+
+
+static inline void invalidate_shadow_ldt(struct exec_domain *d)
+{
+ if ( d->arch.shadow_ldt_mapcnt != 0 )
+ __invalidate_shadow_ldt(d);
+}
+
+
+static int alloc_segdesc_page(struct pfn_info *page)
+{
+ struct desc_struct *descs;
+ int i;
+
+ descs = map_domain_mem((page-frame_table) << PAGE_SHIFT);
+
+ for ( i = 0; i < 512; i++ )
+ if ( unlikely(!check_descriptor(&descs[i])) )
+ goto fail;
+
+ unmap_domain_mem(descs);
+ return 1;
+
+ fail:
+ unmap_domain_mem(descs);
+ return 0;
+}
+
+
+/* Map shadow page at offset @off. */
+int map_ldt_shadow_page(unsigned int off)
+{
+ struct exec_domain *ed = current;
+ struct domain *d = ed->domain;
+ unsigned long l1e;
+
+ if ( unlikely(in_irq()) )
+ BUG();
+
+ __get_user(l1e, (unsigned long *)
+ &linear_pg_table[l1_linear_offset(ed->arch.ldt_base) + off]);
+
+ if ( unlikely(!(l1e & _PAGE_PRESENT)) ||
+ unlikely(!get_page_and_type(&frame_table[l1e >> PAGE_SHIFT],
+ d, PGT_ldt_page)) )
+ return 0;
+
+ ed->arch.perdomain_ptes[off + 16] = mk_l1_pgentry(l1e | _PAGE_RW);
+ ed->arch.shadow_ldt_mapcnt++;
+
+ return 1;
+}
+
+
+static int get_page_from_pagenr(unsigned long page_nr, struct domain *d)
+{
+ struct pfn_info *page = &frame_table[page_nr];
+
+ if ( unlikely(!pfn_is_ram(page_nr)) )
+ {
+ MEM_LOG("Pfn %p is not RAM", page_nr);
+ return 0;
+ }
+
+ if ( unlikely(!get_page(page, d)) )
+ {
+ MEM_LOG("Could not get page ref for pfn %p", page_nr);
+ return 0;
+ }
+
+ return 1;
+}
+
+
+static int get_page_and_type_from_pagenr(unsigned long page_nr,
+ u32 type,
+ struct domain *d)
+{
+ struct pfn_info *page = &frame_table[page_nr];
+
+ if ( unlikely(!get_page_from_pagenr(page_nr, d)) )
+ return 0;
+
+ if ( unlikely(!get_page_type(page, type)) )
+ {
+ if ( (type & PGT_type_mask) != PGT_l1_page_table )
+ MEM_LOG("Bad page type for pfn %p (%08x)",
+ page_nr, page->u.inuse.type_info);
+ put_page(page);
+ return 0;
+ }
+
+ return 1;
+}
+
+
+/*
+ * We allow L2 tables to map each other (a.k.a. linear page tables). This
+ * needs some special care with reference counts and access permissions:
+ * 1. The mapping entry must be read-only, or the guest may get write access
+ * to its own PTEs.
+ * 2. We must only bump the reference counts for an *already validated*
+ * L2 table, or we can end up in a deadlock in get_page_type() by waiting
+ * on a validation that is required to complete that validation.
+ * 3. We only need to increment the reference counts for the mapped page
+ * frame if it is mapped by a different L2 table. This is sufficient and
+ * also necessary to allow validation of an L2 table mapping itself.
+ */
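+
+/*
+ * Illustrative example ('l2_slot_maddr' and 'l2_mfn' are hypothetical
+ * placeholders): a guest builds the classic self-map by pointing one
+ * read-only L2 slot back at the L2 page itself:
+ *
+ *     req.ptr = l2_slot_maddr | MMU_NORMAL_PT_UPDATE;
+ *     req.val = (l2_mfn << PAGE_SHIFT) | _PAGE_PRESENT;   /* no _PAGE_RW */
+ *
+ * mod_l2_entry() then reaches this function via get_page_from_l2e().
+ */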
+static int
+get_linear_pagetable(
+ l2_pgentry_t l2e, unsigned long pfn, struct domain *d)
+{
+ u32 x, y;
+ struct pfn_info *page;
+
+ if ( (l2_pgentry_val(l2e) & _PAGE_RW) )
+ {
+ MEM_LOG("Attempt to create linear p.t. with write perms");
+ return 0;
+ }
+
+ if ( (l2_pgentry_val(l2e) >> PAGE_SHIFT) != pfn )
+ {
+ /* Make sure the mapped frame belongs to the correct domain. */
+ if ( unlikely(!get_page_from_pagenr(l2_pgentry_to_pfn(l2e), d)) )
+ return 0;
+
+ /*
+ * Make sure that the mapped frame is an already-validated L2 table.
+ * If so, atomically increment the count (checking for overflow).
+ */
+ page = &frame_table[l2_pgentry_to_pfn(l2e)];
+ y = page->u.inuse.type_info;
+ do {
+ x = y;
+ if ( unlikely((x & PGT_count_mask) == PGT_count_mask) ||
+ unlikely((x & (PGT_type_mask|PGT_validated)) !=
+ (PGT_l2_page_table|PGT_validated)) )
+ {
+ put_page(page);
+ return 0;
+ }
+ }
+ while ( (y = cmpxchg(&page->u.inuse.type_info, x, x + 1)) != x );
+ }
+
+ return 1;
+}
+
+
+static int
+get_page_from_l1e(
+ l1_pgentry_t l1e, struct domain *d)
+{
+ unsigned long l1v = l1_pgentry_val(l1e);
+ unsigned long pfn = l1_pgentry_to_pfn(l1e);
+ struct pfn_info *page = &frame_table[pfn];
+ extern int domain_iomem_in_pfn(struct domain *d, unsigned long pfn);
+
+ if ( !(l1v & _PAGE_PRESENT) )
+ return 1;
+
+ if ( unlikely(l1v & (_PAGE_GLOBAL|_PAGE_PAT)) )
+ {
+ MEM_LOG("Bad L1 type settings %04lx", l1v & (_PAGE_GLOBAL|_PAGE_PAT));
+ return 0;
+ }
+
+ if ( unlikely(!pfn_is_ram(pfn)) )
+ {
+ /* Revert to caller privileges if FD == DOMID_IO. */
+ if ( d == dom_io )
+ d = current->domain;
+
+ if ( IS_PRIV(d) )
+ return 1;
+
+ if ( IS_CAPABLE_PHYSDEV(d) )
+ return domain_iomem_in_pfn(d, pfn);
+
+ MEM_LOG("Non-privileged attempt to map I/O space %p", pfn);
+ return 0;
+ }
+
+ return ((l1v & _PAGE_RW) ?
+ get_page_and_type(page, d, PGT_writable_page) :
+ get_page(page, d));
+}
+
+
+/* NB. Virtual address 'l2e' maps to a machine address within frame 'pfn'. */
+static int
+get_page_from_l2e(
+ l2_pgentry_t l2e, unsigned long pfn,
+ struct domain *d, unsigned long va_idx)
+{
+ int rc;
+
+ if ( !(l2_pgentry_val(l2e) & _PAGE_PRESENT) )
+ return 1;
+
+ if ( unlikely((l2_pgentry_val(l2e) & (_PAGE_GLOBAL|_PAGE_PSE))) )
+ {
+ MEM_LOG("Bad L2 page type settings %04lx",
+ l2_pgentry_val(l2e) & (_PAGE_GLOBAL|_PAGE_PSE));
+ return 0;
+ }
+
+ rc = get_page_and_type_from_pagenr(
+ l2_pgentry_to_pfn(l2e),
+ PGT_l1_page_table | (va_idx<<PGT_va_shift), d);
+
+ if ( unlikely(!rc) )
+ return get_linear_pagetable(l2e, pfn, d);
+
+ return 1;
+}
+
+
+static void put_page_from_l1e(l1_pgentry_t l1e, struct domain *d)
+{
+ unsigned long l1v = l1_pgentry_val(l1e);
+ unsigned long pfn = l1_pgentry_to_pfn(l1e);
+ struct pfn_info *page = &frame_table[pfn];
+ struct domain *e;
+
+ if ( !(l1v & _PAGE_PRESENT) || !pfn_is_ram(pfn) )
+ return;
+
+ e = page_get_owner(page);
+ if ( unlikely(e != d) )
+ {
+ /*
+ * Unmap a foreign page that may have been mapped via a grant table.
+ * Note that this can fail for a privileged domain that can map foreign
+ * pages via MMUEXT_SET_FOREIGNDOM. Such domains can have some mappings
+ * counted via a grant entry and some counted directly in the page
+ * structure's reference count. Note that reference counts won't get
+ * dangerously confused as long as we always try to decrement the
+ * grant entry first. We may end up with a mismatch between which
+ * mappings and which unmappings are counted via the grant entry, but
+ * really it doesn't matter as privileged domains have carte blanche.
+ */
+ if ( likely(gnttab_check_unmap(e, d, pfn, !(l1v & _PAGE_RW))) )
+ return;
+ /* Assume this mapping was made via MMUEXT_SET_FOREIGNDOM... */
+ }
+
+ if ( l1v & _PAGE_RW )
+ {
+ put_page_and_type(page);
+ }
+ else
+ {
+ /* We expect this is rare so we blow the entire shadow LDT. */
+ if ( unlikely(((page->u.inuse.type_info & PGT_type_mask) ==
+ PGT_ldt_page)) &&
+ unlikely(((page->u.inuse.type_info & PGT_count_mask) != 0)) )
+ invalidate_shadow_ldt(e->exec_domain[0]);
+ put_page(page);
+ }
+}
+
+
+/*
+ * NB. Virtual address 'l2e' maps to a machine address within frame 'pfn'.
+ * Note also that this automatically deals correctly with linear p.t.'s.
+ */
+static void put_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn)
+{
+ if ( (l2_pgentry_val(l2e) & _PAGE_PRESENT) &&
+ ((l2_pgentry_val(l2e) >> PAGE_SHIFT) != pfn) )
+ put_page_and_type(&frame_table[l2_pgentry_to_pfn(l2e)]);
+}
+
+
+static int alloc_l2_table(struct pfn_info *page)
+{
+ struct domain *d = page_get_owner(page);
+ unsigned long page_nr = page_to_pfn(page);
+ l2_pgentry_t *pl2e;
+ int i;
+
+ pl2e = map_domain_mem(page_nr << PAGE_SHIFT);
+
+ for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ )
+ if ( unlikely(!get_page_from_l2e(pl2e[i], page_nr, d, i)) )
+ goto fail;
+
+#if defined(__i386__)
+ /* Now we add our private high mappings. */
+ memcpy(&pl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
+ &idle_pg_table[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
+ HYPERVISOR_ENTRIES_PER_L2_PAGETABLE * sizeof(l2_pgentry_t));
+ pl2e[LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT] =
+ mk_l2_pgentry((page_nr << PAGE_SHIFT) | __PAGE_HYPERVISOR);
+ pl2e[PERDOMAIN_VIRT_START >> L2_PAGETABLE_SHIFT] =
+ mk_l2_pgentry(__pa(page_get_owner(page)->arch.mm_perdomain_pt) |
+ __PAGE_HYPERVISOR);
+#endif
+
+ unmap_domain_mem(pl2e);
+ return 1;
+
+ fail:
+ while ( i-- > 0 )
+ put_page_from_l2e(pl2e[i], page_nr);
+
+ unmap_domain_mem(pl2e);
+ return 0;
+}
+
+
+static int alloc_l1_table(struct pfn_info *page)
+{
+ struct domain *d = page_get_owner(page);
+ unsigned long page_nr = page_to_pfn(page);
+ l1_pgentry_t *pl1e;
+ int i;
+
+ pl1e = map_domain_mem(page_nr << PAGE_SHIFT);
+
+ for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ )
+ if ( unlikely(!get_page_from_l1e(pl1e[i], d)) )
+ goto fail;
+
+ unmap_domain_mem(pl1e);
+ return 1;
+
+ fail:
+ while ( i-- > 0 )
+ put_page_from_l1e(pl1e[i], d);
+
+ unmap_domain_mem(pl1e);
+ return 0;
+}
+
+
+static void free_l2_table(struct pfn_info *page)
+{
+ unsigned long page_nr = page - frame_table;
+ l2_pgentry_t *pl2e;
+ int i;
+
+ pl2e = map_domain_mem(page_nr << PAGE_SHIFT);
+
+ for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ )
+ put_page_from_l2e(pl2e[i], page_nr);
+
+ unmap_domain_mem(pl2e);
+}
+
+
+static void free_l1_table(struct pfn_info *page)
+{
+ struct domain *d = page_get_owner(page);
+ unsigned long page_nr = page - frame_table;
+ l1_pgentry_t *pl1e;
+ int i;
+
+ pl1e = map_domain_mem(page_nr << PAGE_SHIFT);
+
+ for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ )
+ put_page_from_l1e(pl1e[i], d);
+
+ unmap_domain_mem(pl1e);
+}
+
+
+static inline int update_l2e(l2_pgentry_t *pl2e,
+ l2_pgentry_t ol2e,
+ l2_pgentry_t nl2e)
+{
+ unsigned long o = cmpxchg((unsigned long *)pl2e,
+ l2_pgentry_val(ol2e),
+ l2_pgentry_val(nl2e));
+ if ( o != l2_pgentry_val(ol2e) )
+ MEM_LOG("Failed to update %p -> %p: saw %p\n",
+ l2_pgentry_val(ol2e), l2_pgentry_val(nl2e), o);
+ return (o == l2_pgentry_val(ol2e));
+}
+
+
+/* Update the L2 entry at pl2e to new value nl2e. pl2e is within frame pfn. */
+static int mod_l2_entry(l2_pgentry_t *pl2e,
+ l2_pgentry_t nl2e,
+ unsigned long pfn)
+{
+ l2_pgentry_t ol2e;
+ unsigned long _ol2e;
+
+ if ( unlikely((((unsigned long)pl2e & (PAGE_SIZE-1)) >> 2) >=
+ DOMAIN_ENTRIES_PER_L2_PAGETABLE) )
+ {
+ MEM_LOG("Illegal L2 update attempt in Xen-private area %p", pl2e);
+ return 0;
+ }
+
+ if ( unlikely(__get_user(_ol2e, (unsigned long *)pl2e) != 0) )
+ return 0;
+ ol2e = mk_l2_pgentry(_ol2e);
+
+ if ( l2_pgentry_val(nl2e) & _PAGE_PRESENT )
+ {
+ /* Differ in mapping (bits 12-31) or presence (bit 0)? */
+ if ( ((l2_pgentry_val(ol2e) ^ l2_pgentry_val(nl2e)) & ~0xffe) == 0 )
+ return update_l2e(pl2e, ol2e, nl2e);
+
+ if ( unlikely(!get_page_from_l2e(nl2e, pfn, current->domain,
+ ((unsigned long)pl2e &
+ ~PAGE_MASK) >> 2)) )
+ return 0;
+
+ if ( unlikely(!update_l2e(pl2e, ol2e, nl2e)) )
+ {
+ put_page_from_l2e(nl2e, pfn);
+ return 0;
+ }
+
+ put_page_from_l2e(ol2e, pfn);
+ return 1;
+ }
+
+ if ( unlikely(!update_l2e(pl2e, ol2e, nl2e)) )
+ return 0;
+
+ put_page_from_l2e(ol2e, pfn);
+ return 1;
+}
+
+
+static inline int update_l1e(l1_pgentry_t *pl1e,
+ l1_pgentry_t ol1e,
+ l1_pgentry_t nl1e)
+{
+ unsigned long o = l1_pgentry_val(ol1e);
+ unsigned long n = l1_pgentry_val(nl1e);
+
+ if ( unlikely(cmpxchg_user(pl1e, o, n) != 0) ||
+ unlikely(o != l1_pgentry_val(ol1e)) )
+ {
+ MEM_LOG("Failed to update %p -> %p: saw %p\n",
+ l1_pgentry_val(ol1e), l1_pgentry_val(nl1e), o);
+ return 0;
+ }
+
+ return 1;
+}
+
+
+/* Update the L1 entry at pl1e to new value nl1e. */
+static int mod_l1_entry(l1_pgentry_t *pl1e, l1_pgentry_t nl1e)
+{
+ l1_pgentry_t ol1e;
+ unsigned long _ol1e;
+ struct domain *d = current->domain;
+
+ if ( unlikely(__get_user(_ol1e, (unsigned long *)pl1e) != 0) )
+ {
+ MEM_LOG("Bad get_user\n");
+ return 0;
+ }
+
+ ol1e = mk_l1_pgentry(_ol1e);
+
+ if ( l1_pgentry_val(nl1e) & _PAGE_PRESENT )
+ {
+ /* Same mapping (bits 12-31), r/w (bit 1), and presence (bit 0)? */
+ if ( ((l1_pgentry_val(ol1e) ^ l1_pgentry_val(nl1e)) & ~0xffc) == 0 )
+ return update_l1e(pl1e, ol1e, nl1e);
+
+ if ( unlikely(!get_page_from_l1e(nl1e, FOREIGNDOM)) )
+ return 0;
+
+ if ( unlikely(!update_l1e(pl1e, ol1e, nl1e)) )
+ {
+ put_page_from_l1e(nl1e, d);
+ return 0;
+ }
+
+ put_page_from_l1e(ol1e, d);
+ return 1;
+ }
+
+ if ( unlikely(!update_l1e(pl1e, ol1e, nl1e)) )
+ return 0;
+
+ put_page_from_l1e(ol1e, d);
+ return 1;
+}
+
+
+int alloc_page_type(struct pfn_info *page, unsigned int type)
+{
+ switch ( type )
+ {
+ case PGT_l1_page_table:
+ return alloc_l1_table(page);
+ case PGT_l2_page_table:
+ return alloc_l2_table(page);
+ case PGT_gdt_page:
+ case PGT_ldt_page:
+ return alloc_segdesc_page(page);
+ default:
+ printk("Bad type in alloc_page_type %x t=%x c=%x\n",
+ type, page->u.inuse.type_info,
+ page->count_info);
+ BUG();
+ }
+
+ return 0;
+}
+
+
+void free_page_type(struct pfn_info *page, unsigned int type)
+{
+ struct domain *d = page_get_owner(page);
+
+ switch ( type )
+ {
+ case PGT_l1_page_table:
+ free_l1_table(page);
+ break;
+
+ case PGT_l2_page_table:
+ free_l2_table(page);
+ break;
+
+ default:
+ BUG();
+ }
+
+ if ( unlikely(shadow_mode(d)) &&
+ (get_shadow_status(d, page_to_pfn(page)) & PSH_shadowed) )
+ {
+ unshadow_table(page_to_pfn(page), type);
+ put_shadow_status(d);
+ }
+}
+
+
+void put_page_type(struct pfn_info *page)
+{
+ u32 nx, x, y = page->u.inuse.type_info;
+
+ again:
+ do {
+ x = y;
+ nx = x - 1;
+
+ ASSERT((x & PGT_count_mask) != 0);
+
+ /*
+ * The page should always be validated while a reference is held. The
+ * exception is during domain destruction, when we forcibly invalidate
+ * page-table pages if we detect a referential loop.
+ * See domain.c:relinquish_list().
+ */
+ ASSERT((x & PGT_validated) ||
+ test_bit(DF_DYING, &page_get_owner(page)->d_flags));
+
+ if ( unlikely((nx & PGT_count_mask) == 0) )
+ {
+ /* Record TLB information for flush later. Races are harmless. */
+ page->tlbflush_timestamp = tlbflush_current_time();
+
+ if ( unlikely((nx & PGT_type_mask) <= PGT_l4_page_table) &&
+ likely(nx & PGT_validated) )
+ {
+ /*
+ * Page-table pages must be unvalidated when count is zero. The
+ * 'free' is safe because the refcnt is non-zero and validated
+ * bit is clear => other ops will spin or fail.
+ */
+ if ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x,
+ x & ~PGT_validated)) != x) )
+ goto again;
+                /* We cleared the 'valid bit' so we do the clean-up. */
+ free_page_type(page, x & PGT_type_mask);
+ /* Carry on, but with the 'valid bit' now clear. */
+ x &= ~PGT_validated;
+ nx &= ~PGT_validated;
+ }
+ }
+ else if ( unlikely((nx & (PGT_pinned | PGT_count_mask)) ==
+ (PGT_pinned | 1)) )
+ {
+ /* Page is now only pinned. Make the back pointer mutable again. */
+ nx |= PGT_va_mutable;
+ }
+ }
+ while ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x) );
+}
+
+
+int get_page_type(struct pfn_info *page, u32 type)
+{
+ u32 nx, x, y = page->u.inuse.type_info;
+
+ again:
+ do {
+ x = y;
+ nx = x + 1;
+ if ( unlikely((nx & PGT_count_mask) == 0) )
+ {
+ MEM_LOG("Type count overflow on pfn %p\n", page_to_pfn(page));
+ return 0;
+ }
+ else if ( unlikely((x & PGT_count_mask) == 0) )
+ {
+ if ( (x & (PGT_type_mask|PGT_va_mask)) != type )
+ {
+ /*
+                 * On a type change we check whether stale TLB entries must
+                 * be flushed. This may be unnecessary (e.g., the page was a
+                 * GDT/LDT), but such circumstances should be very rare.
+ */
+ struct domain *d = page_get_owner(page);
+ if ( unlikely(NEED_FLUSH(tlbflush_time[d->exec_domain[0]->processor],
+ page->tlbflush_timestamp)) )
+ {
+ perfc_incr(need_flush_tlb_flush);
+ flush_tlb_cpu(d->exec_domain[0]->processor);
+ }
+
+ /* We lose existing type, back pointer, and validity. */
+ nx &= ~(PGT_type_mask | PGT_va_mask | PGT_validated);
+ nx |= type;
+
+ /* No special validation needed for writable pages. */
+ /* Page tables and GDT/LDT need to be scanned for validity. */
+ if ( type == PGT_writable_page )
+ nx |= PGT_validated;
+ }
+ }
+ else if ( unlikely((x & (PGT_type_mask|PGT_va_mask)) != type) )
+ {
+ if ( unlikely((x & PGT_type_mask) != (type & PGT_type_mask) ) )
+ {
+ if ( ((x & PGT_type_mask) != PGT_l2_page_table) ||
+ ((type & PGT_type_mask) != PGT_l1_page_table) )
+ MEM_LOG("Bad type (saw %08x != exp %08x) for pfn %p\n",
+ x & PGT_type_mask, type, page_to_pfn(page));
+ return 0;
+ }
+ else if ( (x & PGT_va_mask) == PGT_va_mutable )
+ {
+ /* The va backpointer is mutable, hence we update it. */
+ nx &= ~PGT_va_mask;
+ nx |= type; /* we know the actual type is correct */
+ }
+ else if ( unlikely((x & PGT_va_mask) != (type & PGT_va_mask)) )
+ {
+ /* This table is potentially mapped at multiple locations. */
+ nx &= ~PGT_va_mask;
+ nx |= PGT_va_unknown;
+ }
+ }
+ else if ( unlikely(!(x & PGT_validated)) )
+ {
+ /* Someone else is updating validation of this page. Wait... */
+ while ( (y = page->u.inuse.type_info) == x )
+ {
+ rep_nop();
+ barrier();
+ }
+ goto again;
+ }
+ }
+ while ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x) );
+
+ if ( unlikely(!(nx & PGT_validated)) )
+ {
+ /* Try to validate page type; drop the new reference on failure. */
+ if ( unlikely(!alloc_page_type(page, type & PGT_type_mask)) )
+ {
+ MEM_LOG("Error while validating pfn %p for type %08x."
+ " caf=%08x taf=%08x\n",
+ page_to_pfn(page), type,
+ page->count_info,
+ page->u.inuse.type_info);
+            /* No one else can get a reference. We hold the only ref. */
+ page->u.inuse.type_info = 0;
+ return 0;
+ }
+
+        /* No one else is updating simultaneously. */
+ __set_bit(_PGT_validated, &page->u.inuse.type_info);
+ }
+
+ return 1;
+}
+
+
+int new_guest_cr3(unsigned long pfn)
+{
+ struct exec_domain *ed = current;
+ struct domain *d = ed->domain;
+ int okay, cpu = smp_processor_id();
+ unsigned long old_base_pfn;
+
+ okay = get_page_and_type_from_pagenr(pfn, PGT_l2_page_table, d);
+ if ( likely(okay) )
+ {
+ invalidate_shadow_ldt(ed);
+
+ percpu_info[cpu].deferred_ops &= ~DOP_FLUSH_TLB;
+ old_base_pfn = pagetable_val(ed->arch.pagetable) >> PAGE_SHIFT;
+ ed->arch.pagetable = mk_pagetable(pfn << PAGE_SHIFT);
+
+ shadow_mk_pagetable(ed);
+
+ write_ptbase(ed);
+
+ put_page_and_type(&frame_table[old_base_pfn]);
+ }
+ else
+ {
+ MEM_LOG("Error while installing new baseptr %p", pfn);
+ }
+
+ return okay;
+}
+
+static int do_extended_command(unsigned long ptr, unsigned long val)
+{
+ int okay = 1, cpu = smp_processor_id();
+ unsigned int cmd = val & MMUEXT_CMD_MASK;
+ unsigned long pfn = ptr >> PAGE_SHIFT;
+ struct pfn_info *page = &frame_table[pfn];
+ struct exec_domain *ed = current;
+ struct domain *d = ed->domain, *nd, *e;
+ u32 x, y;
+ domid_t domid;
+ grant_ref_t gntref;
+
+ switch ( cmd )
+ {
+ case MMUEXT_PIN_L1_TABLE:
+ case MMUEXT_PIN_L2_TABLE:
+ /*
+ * We insist that, if you pin an L1 page, it's the first thing that
+ * you do to it. This is because we require the backptr to still be
+ * mutable. This assumption seems safe.
+ */
+ okay = get_page_and_type_from_pagenr(
+ pfn,
+ ((cmd==MMUEXT_PIN_L2_TABLE) ?
+ PGT_l2_page_table : (PGT_l1_page_table|PGT_va_mutable)),
+ FOREIGNDOM);
+
+ if ( unlikely(!okay) )
+ {
+ MEM_LOG("Error while pinning pfn %p", pfn);
+ break;
+ }
+
+ if ( unlikely(test_and_set_bit(_PGT_pinned,
+ &page->u.inuse.type_info)) )
+ {
+ MEM_LOG("Pfn %p already pinned", pfn);
+ put_page_and_type(page);
+ okay = 0;
+ break;
+ }
+
+ break;
+
+ case MMUEXT_UNPIN_TABLE:
+ if ( unlikely(!(okay = get_page_from_pagenr(pfn, FOREIGNDOM))) )
+ {
+ MEM_LOG("Page %p bad domain (dom=%p)",
+ ptr, page_get_owner(page));
+ }
+ else if ( likely(test_and_clear_bit(_PGT_pinned,
+ &page->u.inuse.type_info)) )
+ {
+ put_page_and_type(page);
+ put_page(page);
+ }
+ else
+ {
+ okay = 0;
+ put_page(page);
+ MEM_LOG("Pfn %p not pinned", pfn);
+ }
+ break;
+
+ case MMUEXT_NEW_BASEPTR:
+ okay = new_guest_cr3(pfn);
+ break;
+
+ case MMUEXT_TLB_FLUSH:
+ percpu_info[cpu].deferred_ops |= DOP_FLUSH_TLB;
+ break;
+
+ case MMUEXT_INVLPG:
+ __flush_tlb_one(ptr);
+ break;
+
+ case MMUEXT_FLUSH_CACHE:
+ if ( unlikely(!IS_CAPABLE_PHYSDEV(d)) )
+ {
+ MEM_LOG("Non-physdev domain tried to FLUSH_CACHE.\n");
+ okay = 0;
+ }
+ else
+ {
+ wbinvd();
+ }
+ break;
+
+ case MMUEXT_SET_LDT:
+ {
+ unsigned long ents = val >> MMUEXT_CMD_SHIFT;
+ if ( ((ptr & (PAGE_SIZE-1)) != 0) ||
+ (ents > 8192) ||
+ ((ptr+ents*LDT_ENTRY_SIZE) < ptr) ||
+ ((ptr+ents*LDT_ENTRY_SIZE) > PAGE_OFFSET) )
+ {
+ okay = 0;
+ MEM_LOG("Bad args to SET_LDT: ptr=%p, ents=%p", ptr, ents);
+ }
+ else if ( (ed->arch.ldt_ents != ents) ||
+ (ed->arch.ldt_base != ptr) )
+ {
+ invalidate_shadow_ldt(ed);
+ ed->arch.ldt_base = ptr;
+ ed->arch.ldt_ents = ents;
+ load_LDT(ed);
+ percpu_info[cpu].deferred_ops &= ~DOP_RELOAD_LDT;
+ if ( ents != 0 )
+ percpu_info[cpu].deferred_ops |= DOP_RELOAD_LDT;
+ }
+ break;
+ }
+
+ case MMUEXT_SET_FOREIGNDOM:
+ domid = (domid_t)(val >> 16);
+
+ if ( (e = percpu_info[cpu].foreign) != NULL )
+ put_domain(e);
+ percpu_info[cpu].foreign = NULL;
+
+ if ( !IS_PRIV(d) )
+ {
+ switch ( domid )
+ {
+ case DOMID_IO:
+ get_knownalive_domain(dom_io);
+ percpu_info[cpu].foreign = dom_io;
+ break;
+ default:
+ MEM_LOG("Dom %u cannot set foreign dom\n", d->id);
+ okay = 0;
+ break;
+ }
+ }
+ else
+ {
+ percpu_info[cpu].foreign = e = find_domain_by_id(domid);
+ if ( e == NULL )
+ {
+ switch ( domid )
+ {
+ case DOMID_XEN:
+ get_knownalive_domain(dom_xen);
+ percpu_info[cpu].foreign = dom_xen;
+ break;
+ case DOMID_IO:
+ get_knownalive_domain(dom_io);
+ percpu_info[cpu].foreign = dom_io;
+ break;
+ default:
+ MEM_LOG("Unknown domain '%u'", domid);
+ okay = 0;
+ break;
+ }
+ }
+ }
+ break;
+
+ case MMUEXT_TRANSFER_PAGE:
+ domid = (domid_t)(val >> 16);
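+        /*
+         * The 16-bit grant reference is split across the operands: bits
+         * 15:8 come from 'val', bits 7:0 from 'ptr' bits 9:2 (the bits
+         * free below the page-aligned pfn and above the 2-bit command).
+         */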
+ gntref = (grant_ref_t)((val & 0xFF00) | ((ptr >> 2) & 0x00FF));
+
+ if ( unlikely(IS_XEN_HEAP_FRAME(page)) ||
+ unlikely(!pfn_is_ram(pfn)) ||
+ unlikely((e = find_domain_by_id(domid)) == NULL) )
+ {
+ MEM_LOG("Bad frame (%p) or bad domid (%d).\n", pfn, domid);
+ okay = 0;
+ break;
+ }
+
+ spin_lock(&d->page_alloc_lock);
+
+ /*
+ * The tricky bit: atomically release ownership while there is just one
+ * benign reference to the page (PGC_allocated). If that reference
+ * disappears then the deallocation routine will safely spin.
+ */
+ nd = page_get_owner(page);
+ y = page->count_info;
+ do {
+ x = y;
+ if ( unlikely((x & (PGC_count_mask|PGC_allocated)) !=
+ (1|PGC_allocated)) ||
+ unlikely(nd != d) )
+ {
+ MEM_LOG("Bad page values %p: ed=%p(%u), sd=%p,"
+ " caf=%08x, taf=%08x\n", page_to_pfn(page),
+ d, d->id, nd, x, page->u.inuse.type_info);
+ spin_unlock(&d->page_alloc_lock);
+ put_domain(e);
+ return 0;
+ }
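+            /*
+             * cmpxchg8b compares edx:eax -- the expected (owner, count) --
+             * with the adjacent 64-bit (count_info, owner) pair in memory
+             * and, on match, atomically stores ecx:ebx, i.e. clears the
+             * owner to NULL while leaving count_info == x.
+             */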
+ __asm__ __volatile__(
+ LOCK_PREFIX "cmpxchg8b %2"
+ : "=d" (nd), "=a" (y),
+ "=m" (*(volatile u64 *)(&page->count_info))
+ : "0" (d), "1" (x), "c" (NULL), "b" (x) );
+ }
+ while ( unlikely(nd != d) || unlikely(y != x) );
+
+ /*
+ * Unlink from 'd'. At least one reference remains (now anonymous), so
+     * no one else is spinning to try to delete this page from 'd'.
+ */
+ d->tot_pages--;
+ list_del(&page->list);
+
+ spin_unlock(&d->page_alloc_lock);
+
+ spin_lock(&e->page_alloc_lock);
+
+ /*
+ * Check that 'e' will accept the page and has reservation headroom.
+ * Also, a domain mustn't have PGC_allocated pages when it is dying.
+ */
+ ASSERT(e->tot_pages <= e->max_pages);
+ if ( unlikely(test_bit(DF_DYING, &e->d_flags)) ||
+ unlikely(e->tot_pages == e->max_pages) ||
+ unlikely(!gnttab_prepare_for_transfer(e, d, gntref)) )
+ {
+ MEM_LOG("Transferee has no reservation headroom (%d,%d), or "
+ "provided a bad grant ref, or is dying (%p).\n",
+ e->tot_pages, e->max_pages, e->d_flags);
+ spin_unlock(&e->page_alloc_lock);
+ put_domain(e);
+ okay = 0;
+ break;
+ }
+
+ /* Okay, add the page to 'e'. */
+ if ( unlikely(e->tot_pages++ == 0) )
+ get_knownalive_domain(e);
+ list_add_tail(&page->list, &e->page_list);
+ page_set_owner(page, e);
+
+ spin_unlock(&e->page_alloc_lock);
+
+ /* Transfer is all done: tell the guest about its new page frame. */
+ gnttab_notify_transfer(e, gntref, pfn);
+
+ put_domain(e);
+ break;
+
+ case MMUEXT_REASSIGN_PAGE:
+ if ( unlikely(!IS_PRIV(d)) )
+ {
+ MEM_LOG("Dom %u has no reassignment priv", d->id);
+ okay = 0;
+ break;
+ }
+
+ e = percpu_info[cpu].foreign;
+ if ( unlikely(e == NULL) )
+ {
+ MEM_LOG("No FOREIGNDOM to reassign pfn %p to", pfn);
+ okay = 0;
+ break;
+ }
+
+ /*
+ * Grab both page_list locks, in order. This prevents the page from
+ * disappearing elsewhere while we modify the owner, and we'll need
+ * both locks if we're successful so that we can change lists.
+ */
+ if ( d < e )
+ {
+ spin_lock(&d->page_alloc_lock);
+ spin_lock(&e->page_alloc_lock);
+ }
+ else
+ {
+ spin_lock(&e->page_alloc_lock);
+ spin_lock(&d->page_alloc_lock);
+ }
+
+ /* A domain shouldn't have PGC_allocated pages when it is dying. */
+ if ( unlikely(test_bit(DF_DYING, &e->d_flags)) ||
+ unlikely(IS_XEN_HEAP_FRAME(page)) )
+ {
+ MEM_LOG("Reassignment page is Xen heap, or dest dom is dying.");
+ okay = 0;
+ goto reassign_fail;
+ }
+
+ /*
+ * The tricky bit: atomically change owner while there is just one
+ * benign reference to the page (PGC_allocated). If that reference
+ * disappears then the deallocation routine will safely spin.
+ */
+ nd = page_get_owner(page);
+ y = page->count_info;
+ do {
+ x = y;
+ if ( unlikely((x & (PGC_count_mask|PGC_allocated)) !=
+ (1|PGC_allocated)) ||
+ unlikely(nd != d) )
+ {
+ MEM_LOG("Bad page values %p: ed=%p(%u), sd=%p,"
+ " caf=%08x, taf=%08x\n", page_to_pfn(page),
+ d, d->id, nd, x, page->u.inuse.type_info);
+ okay = 0;
+ goto reassign_fail;
+ }
+ __asm__ __volatile__(
+ LOCK_PREFIX "cmpxchg8b %3"
+ : "=d" (nd), "=a" (y), "=c" (e),
+ "=m" (*(volatile u64 *)(&page->count_info))
+ : "0" (d), "1" (x), "c" (e), "b" (x) );
+ }
+ while ( unlikely(nd != d) || unlikely(y != x) );
+
+ /*
+ * Unlink from 'd'. We transferred at least one reference to 'e', so
+         * no one else is spinning to try to delete this page from 'd'.
+ */
+ d->tot_pages--;
+ list_del(&page->list);
+
+ /*
+ * Add the page to 'e'. Someone may already have removed the last
+ * reference and want to remove the page from 'e'. However, we have
+ * the lock so they'll spin waiting for us.
+ */
+ if ( unlikely(e->tot_pages++ == 0) )
+ get_knownalive_domain(e);
+ list_add_tail(&page->list, &e->page_list);
+
+ reassign_fail:
+ spin_unlock(&d->page_alloc_lock);
+ spin_unlock(&e->page_alloc_lock);
+ break;
+
+ case MMUEXT_CLEAR_FOREIGNDOM:
+ if ( (e = percpu_info[cpu].foreign) != NULL )
+ put_domain(e);
+ percpu_info[cpu].foreign = NULL;
+ break;
+
+ default:
+ MEM_LOG("Invalid extended pt command 0x%p", val & MMUEXT_CMD_MASK);
+ okay = 0;
+ break;
+ }
+
+ return okay;
+}
+
+int do_mmu_update(
+ mmu_update_t *ureqs, unsigned int count, unsigned int *pdone)
+{
+/*
+ * We steal the m.s.b. of the @count parameter to indicate whether this
+ * invocation of do_mmu_update() is resuming a previously preempted call.
+ * We steal the next 15 bits to remember the current FOREIGNDOM.
+ */
+#define MMU_UPDATE_PREEMPTED (~(~0U>>1))
+#define MMU_UPDATE_PREEMPT_FDOM_SHIFT ((sizeof(int)*8)-16)
+#define MMU_UPDATE_PREEMPT_FDOM_MASK (0x7FFFU<<MMU_UPDATE_PREEMPT_FDOM_SHIFT)
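+/*
+ * E.g. a continuation with 3 requests left on behalf of foreign domain 5
+ * re-enters with count == (3 | (5 << MMU_UPDATE_PREEMPT_FDOM_SHIFT) |
+ * MMU_UPDATE_PREEMPTED); see hypercall3_create_continuation() below.
+ */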
+
+ mmu_update_t req;
+ unsigned long va = 0, deferred_ops, pfn, prev_pfn = 0;
+ struct pfn_info *page;
+ int rc = 0, okay = 1, i = 0, cpu = smp_processor_id();
+ unsigned int cmd, done = 0;
+ unsigned long prev_smfn = 0;
+ l1_pgentry_t *prev_spl1e = 0;
+ struct exec_domain *ed = current;
+ struct domain *d = ed->domain;
+ u32 type_info;
+ domid_t domid;
+
+ LOCK_BIGLOCK(d);
+
+ cleanup_writable_pagetable(d);
+
+ if ( unlikely(shadow_mode(d)) )
+ check_pagetable(d, ed->arch.pagetable, "pre-mmu"); /* debug */
+
+ /*
+ * If we are resuming after preemption, read how much work we have already
+ * done. This allows us to set the @done output parameter correctly.
+ * We also reset FOREIGNDOM here.
+ */
+ if ( unlikely(count&(MMU_UPDATE_PREEMPTED|MMU_UPDATE_PREEMPT_FDOM_MASK)) )
+ {
+ if ( !(count & MMU_UPDATE_PREEMPTED) )
+ {
+ /* Count overflow into private FOREIGNDOM field. */
+ MEM_LOG("do_mmu_update count is too large");
+ rc = -EINVAL;
+ goto out;
+ }
+ count &= ~MMU_UPDATE_PREEMPTED;
+ domid = count >> MMU_UPDATE_PREEMPT_FDOM_SHIFT;
+ count &= ~MMU_UPDATE_PREEMPT_FDOM_MASK;
+ if ( unlikely(pdone != NULL) )
+ (void)get_user(done, pdone);
+ if ( (domid != current->domain->id) &&
+ !do_extended_command(0, MMUEXT_SET_FOREIGNDOM | (domid << 16)) )
+ {
+ rc = -EINVAL;
+ goto out;
+ }
+ }
+
+ perfc_incrc(calls_to_mmu_update);
+ perfc_addc(num_page_updates, count);
+
+ if ( unlikely(!array_access_ok(VERIFY_READ, ureqs, count, sizeof(req))) )
+ {
+ rc = -EFAULT;
+ goto out;
+ }
+
+ for ( i = 0; i < count; i++ )
+ {
+ if ( hypercall_preempt_check() )
+ {
+ rc = hypercall3_create_continuation(
+ __HYPERVISOR_mmu_update, ureqs,
+ (count - i) |
+ (FOREIGNDOM->id << MMU_UPDATE_PREEMPT_FDOM_SHIFT) |
+ MMU_UPDATE_PREEMPTED, pdone);
+ break;
+ }
+
+ if ( unlikely(__copy_from_user(&req, ureqs, sizeof(req)) != 0) )
+ {
+ MEM_LOG("Bad __copy_from_user");
+ rc = -EFAULT;
+ break;
+ }
+
+ cmd = req.ptr & (sizeof(l1_pgentry_t)-1);
+ pfn = req.ptr >> PAGE_SHIFT;
+
+ okay = 0;
+
+ switch ( cmd )
+ {
+ /*
+ * MMU_NORMAL_PT_UPDATE: Normal update to any level of page table.
+ */
+ case MMU_NORMAL_PT_UPDATE:
+ if ( unlikely(!get_page_from_pagenr(pfn, current->domain)) )
+ {
+ MEM_LOG("Could not get page for normal update");
+ break;
+ }
+
+ if ( likely(prev_pfn == pfn) )
+ {
+ va = (va & PAGE_MASK) | (req.ptr & ~PAGE_MASK);
+ }
+ else
+ {
+ if ( prev_pfn != 0 )
+ unmap_domain_mem((void *)va);
+ va = (unsigned long)map_domain_mem(req.ptr);
+ prev_pfn = pfn;
+ }
+
+ page = &frame_table[pfn];
+ switch ( (type_info = page->u.inuse.type_info) & PGT_type_mask )
+ {
+ case PGT_l1_page_table:
+ if ( likely(get_page_type(
+ page, type_info & (PGT_type_mask|PGT_va_mask))) )
+ {
+ okay = mod_l1_entry((l1_pgentry_t *)va,
+ mk_l1_pgentry(req.val));
+
+ if ( unlikely(shadow_mode(d)) && okay &&
+ (get_shadow_status(d, page-frame_table) &
+ PSH_shadowed) )
+ {
+ shadow_l1_normal_pt_update(
+ req.ptr, req.val, &prev_smfn, &prev_spl1e);
+ put_shadow_status(d);
+ }
+
+ put_page_type(page);
+ }
+ break;
+ case PGT_l2_page_table:
+ if ( likely(get_page_type(page, PGT_l2_page_table)) )
+ {
+ okay = mod_l2_entry((l2_pgentry_t *)va,
+ mk_l2_pgentry(req.val),
+ pfn);
+
+ if ( unlikely(shadow_mode(d)) && okay &&
+ (get_shadow_status(d, page-frame_table) &
+ PSH_shadowed) )
+ {
+ shadow_l2_normal_pt_update(req.ptr, req.val);
+ put_shadow_status(d);
+ }
+
+ put_page_type(page);
+ }
+ break;
+ default:
+ if ( likely(get_page_type(page, PGT_writable_page)) )
+ {
+ *(unsigned long *)va = req.val;
+ okay = 1;
+ put_page_type(page);
+ }
+ break;
+ }
+
+ put_page(page);
+ break;
+
+ case MMU_MACHPHYS_UPDATE:
+ if ( unlikely(!get_page_from_pagenr(pfn, FOREIGNDOM)) )
+ {
+ MEM_LOG("Could not get page for mach->phys update");
+ break;
+ }
+
+ machine_to_phys_mapping[pfn] = req.val;
+ okay = 1;
+
+ /*
+ * If in log-dirty mode, mark the corresponding pseudo-physical
+ * page as dirty.
+ */
+ if ( unlikely(shadow_mode(d) == SHM_logdirty) &&
+ mark_dirty(d, pfn) )
+ d->arch.shadow_dirty_block_count++;
+
+ put_page(&frame_table[pfn]);
+ break;
+
+ /*
+ * MMU_EXTENDED_COMMAND: Extended command is specified
+     * in the least-significant bits of the 'value' field.
+ */
+ case MMU_EXTENDED_COMMAND:
+ req.ptr &= ~(sizeof(l1_pgentry_t) - 1);
+ okay = do_extended_command(req.ptr, req.val);
+ break;
+
+ default:
+ MEM_LOG("Invalid page update command %p", req.ptr);
+ break;
+ }
+
+ if ( unlikely(!okay) )
+ {
+ rc = -EINVAL;
+ break;
+ }
+
+ ureqs++;
+ }
+
+ out:
+ if ( prev_pfn != 0 )
+ unmap_domain_mem((void *)va);
+
+ if ( unlikely(prev_spl1e != 0) )
+ unmap_domain_mem((void *)prev_spl1e);
+
+ deferred_ops = percpu_info[cpu].deferred_ops;
+ percpu_info[cpu].deferred_ops = 0;
+
+ if ( deferred_ops & DOP_FLUSH_TLB )
+ local_flush_tlb();
+
+ if ( deferred_ops & DOP_RELOAD_LDT )
+ (void)map_ldt_shadow_page(0);
+
+ if ( unlikely(percpu_info[cpu].foreign != NULL) )
+ {
+ put_domain(percpu_info[cpu].foreign);
+ percpu_info[cpu].foreign = NULL;
+ }
+
+ /* Add incremental work we have done to the @done output parameter. */
+ if ( unlikely(pdone != NULL) )
+ __put_user(done + i, pdone);
+
+ if ( unlikely(shadow_mode(d)) )
+ check_pagetable(d, ed->arch.pagetable, "post-mmu"); /* debug */
+
+ UNLOCK_BIGLOCK(d);
+ return rc;
+}
+
+
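+/*
+ * Illustrative guest-side use (a sketch -- wrapper names and argument
+ * conventions vary by guest OS): remap one VA and flush only its TLB
+ * entry:
+ *
+ *     (void)HYPERVISOR_update_va_mapping(va, new_pte_val, UVMF_INVLPG);
+ */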
+int do_update_va_mapping(unsigned long va,
+ unsigned long val,
+ unsigned long flags)
+{
+ struct exec_domain *ed = current;
+ struct domain *d = ed->domain;
+ int err = 0;
+ unsigned int cpu = ed->processor;
+ unsigned long deferred_ops;
+
+ perfc_incrc(calls_to_update_va);
+
+ if ( unlikely(!__addr_ok(va)) )
+ return -EINVAL;
+
+ LOCK_BIGLOCK(d);
+
+ cleanup_writable_pagetable(d);
+
+ /*
+ * XXX When we make this support 4MB superpages we should also deal with
+ * the case of updating L2 entries.
+ */
+
+ if ( unlikely(!mod_l1_entry(&linear_pg_table[l1_linear_offset(va)],
+ mk_l1_pgentry(val))) )
+ err = -EINVAL;
+
+ if ( unlikely(shadow_mode(d)) )
+ {
+ unsigned long sval = 0;
+
+ l1pte_propagate_from_guest(d, &val, &sval);
+
+ if ( unlikely(__put_user(sval, ((unsigned long *)(
+ &shadow_linear_pg_table[l1_linear_offset(va)])))) )
+ {
+ /*
-                perfc_incrc(shadow_update_va_fail);
++             * Since L2's are guaranteed RW, failure indicates either that
++             * the page was not shadowed, or that the L2 entry has not yet
++             * been updated to reflect the shadow.
+             */
-            int i;
++            unsigned l2_idx = va >> L2_PAGETABLE_SHIFT;
++            l2_pgentry_t gpde = linear_l2_table[l2_idx];
++            unsigned long gpfn = l2_pgentry_val(gpde) >> PAGE_SHIFT;
++
++            if ( get_shadow_status(d, gpfn) )
++            {
++                unsigned long *gl1e = map_domain_mem(gpfn << PAGE_SHIFT);
++                unsigned l1_idx = (va >> PAGE_SHIFT) &
++                                  (ENTRIES_PER_L1_PAGETABLE - 1);
++                gl1e[l1_idx] = sval;
++                unmap_domain_mem(gl1e);
++                put_shadow_status(d);
++
++                perfc_incrc(shadow_update_va_fail1);
++            }
++            else
++                perfc_incrc(shadow_update_va_fail2);
+ }
+
+ /*
+ * If we're in log-dirty mode then we need to note that we've updated
+ * the PTE in the PT-holding page. We need the machine frame number
+ * for this.
+ */
+ if ( shadow_mode(d) == SHM_logdirty )
+ mark_dirty(d, va_to_l1mfn(va));
+
+ check_pagetable(d, ed->arch.pagetable, "va"); /* debug */
+ }
+
+ deferred_ops = percpu_info[cpu].deferred_ops;
+ percpu_info[cpu].deferred_ops = 0;
+
+ if ( unlikely(deferred_ops & DOP_FLUSH_TLB) ||
+ unlikely(flags & UVMF_FLUSH_TLB) )
+ local_flush_tlb();
+ else if ( unlikely(flags & UVMF_INVLPG) )
+ __flush_tlb_one(va);
+
+ if ( unlikely(deferred_ops & DOP_RELOAD_LDT) )
+ (void)map_ldt_shadow_page(0);
+
+ UNLOCK_BIGLOCK(d);
+
+ return err;
+}
+
+int do_update_va_mapping_otherdomain(unsigned long va,
+ unsigned long val,
+ unsigned long flags,
+ domid_t domid)
+{
+ unsigned int cpu = smp_processor_id();
+ struct domain *d;
+ int rc;
+
+ if ( unlikely(!IS_PRIV(current->domain)) )
+ return -EPERM;
+
+ percpu_info[cpu].foreign = d = find_domain_by_id(domid);
+ if ( unlikely(d == NULL) )
+ {
+ MEM_LOG("Unknown domain '%u'", domid);
+ return -ESRCH;
+ }
+
+ rc = do_update_va_mapping(va, val, flags);
+
+ put_domain(d);
+ percpu_info[cpu].foreign = NULL;
+
+ return rc;
+}
+
+
+
+/*************************
+ * Descriptor Tables
+ */
+
+void destroy_gdt(struct exec_domain *ed)
+{
+ int i;
+ unsigned long pfn;
+
+ for ( i = 0; i < 16; i++ )
+ {
+ if ( (pfn = l1_pgentry_to_pfn(ed->arch.perdomain_ptes[i])) != 0 )
+ put_page_and_type(&frame_table[pfn]);
+ ed->arch.perdomain_ptes[i] = mk_l1_pgentry(0);
+ }
+}
+
+
+long set_gdt(struct exec_domain *ed,
+ unsigned long *frames,
+ unsigned int entries)
+{
+ struct domain *d = ed->domain;
+ /* NB. There are 512 8-byte entries per GDT page. */
+ int i = 0, nr_pages = (entries + 511) / 512;
+ struct desc_struct *vgdt;
+ unsigned long pfn;
+
+ /* Check the first page in the new GDT. */
+ if ( (pfn = frames[0]) >= max_page )
+ goto fail;
+
+ /* The first page is special because Xen owns a range of entries in it. */
+ if ( !get_page_and_type(&frame_table[pfn], d, PGT_gdt_page) )
+ {
+ /* GDT checks failed: try zapping the Xen reserved entries. */
+ if ( !get_page_and_type(&frame_table[pfn], d, PGT_writable_page) )
+ goto fail;
+ vgdt = map_domain_mem(pfn << PAGE_SHIFT);
+ memset(vgdt + FIRST_RESERVED_GDT_ENTRY, 0,
+ NR_RESERVED_GDT_ENTRIES*8);
+ unmap_domain_mem(vgdt);
+ put_page_and_type(&frame_table[pfn]);
+
+ /* Okay, we zapped the entries. Now try the GDT checks again. */
+ if ( !get_page_and_type(&frame_table[pfn], d, PGT_gdt_page) )
+ goto fail;
+ }
+
+ /* Check the remaining pages in the new GDT. */
+ for ( i = 1; i < nr_pages; i++ )
+ if ( ((pfn = frames[i]) >= max_page) ||
+ !get_page_and_type(&frame_table[pfn], d, PGT_gdt_page) )
+ goto fail;
+
+ /* Copy reserved GDT entries to the new GDT. */
+ vgdt = map_domain_mem(frames[0] << PAGE_SHIFT);
+ memcpy(vgdt + FIRST_RESERVED_GDT_ENTRY,
+ gdt_table + FIRST_RESERVED_GDT_ENTRY,
+ NR_RESERVED_GDT_ENTRIES*8);
+ unmap_domain_mem(vgdt);
+
+ /* Tear down the old GDT. */
+ destroy_gdt(ed);
+
+ /* Install the new GDT. */
+ for ( i = 0; i < nr_pages; i++ )
+ ed->arch.perdomain_ptes[i] =
+ mk_l1_pgentry((frames[i] << PAGE_SHIFT) | __PAGE_HYPERVISOR);
+
+ SET_GDT_ADDRESS(ed, GDT_VIRT_START(ed));
+ SET_GDT_ENTRIES(ed, entries);
+
+ return 0;
+
+ fail:
+ while ( i-- > 0 )
+ put_page_and_type(&frame_table[frames[i]]);
+ return -EINVAL;
+}
+
+
+long do_set_gdt(unsigned long *frame_list, unsigned int entries)
+{
+ int nr_pages = (entries + 511) / 512;
+ unsigned long frames[16];
+ long ret;
+
+ if ( (entries <= LAST_RESERVED_GDT_ENTRY) || (entries > 8192) )
+ return -EINVAL;
+
+ if ( copy_from_user(frames, frame_list, nr_pages * sizeof(unsigned long)) )
+ return -EFAULT;
+
+ LOCK_BIGLOCK(current->domain);
+
+ if ( (ret = set_gdt(current, frames, entries)) == 0 )
+ {
+ local_flush_tlb();
+ __asm__ __volatile__ ("lgdt %0" : "=m" (*current->arch.gdt));
+ }
+
+ UNLOCK_BIGLOCK(current->domain);
+
+ return ret;
+}
+
+
+long do_update_descriptor(
+ unsigned long pa, unsigned long word1, unsigned long word2)
+{
+ unsigned long pfn = pa >> PAGE_SHIFT;
+ struct desc_struct *gdt_pent, d;
+ struct pfn_info *page;
+ struct exec_domain *ed;
+ long ret = -EINVAL;
+
+ d.a = (u32)word1;
+ d.b = (u32)word2;
+
+ LOCK_BIGLOCK(current->domain);
+
+ if ( (pa & 7) || (pfn >= max_page) || !check_descriptor(&d) ) {
+ UNLOCK_BIGLOCK(current->domain);
+ return -EINVAL;
+ }
+
+ page = &frame_table[pfn];
+ if ( unlikely(!get_page(page, current->domain)) ) {
+ UNLOCK_BIGLOCK(current->domain);
+ return -EINVAL;
+ }
+
+ /* Check if the given frame is in use in an unsafe context. */
+ switch ( page->u.inuse.type_info & PGT_type_mask )
+ {
+ case PGT_gdt_page:
+ /* Disallow updates of Xen-reserved descriptors in the current GDT. */
+ for_each_exec_domain(current->domain, ed) {
+ if ( (l1_pgentry_to_pfn(ed->arch.perdomain_ptes[0]) == pfn) &&
+ (((pa&(PAGE_SIZE-1))>>3) >= FIRST_RESERVED_GDT_ENTRY) &&
+ (((pa&(PAGE_SIZE-1))>>3) <= LAST_RESERVED_GDT_ENTRY) )
+ goto out;
+ }
+ if ( unlikely(!get_page_type(page, PGT_gdt_page)) )
+ goto out;
+ break;
+ case PGT_ldt_page:
+ if ( unlikely(!get_page_type(page, PGT_ldt_page)) )
+ goto out;
+ break;
+ default:
+ if ( unlikely(!get_page_type(page, PGT_writable_page)) )
+ goto out;
+ break;
+ }
+
+ /* All is good so make the update. */
+ gdt_pent = map_domain_mem(pa);
+ memcpy(gdt_pent, &d, 8);
+ unmap_domain_mem(gdt_pent);
+
+ put_page_type(page);
+
+ ret = 0; /* success */
+
+ out:
+ put_page(page);
+
+ UNLOCK_BIGLOCK(current->domain);
+
+ return ret;
+}
+
+
+
+/*************************
+ * Writable Pagetables
+ */
+
+ptwr_info_t ptwr_info[NR_CPUS];
+
+#ifdef VERBOSE
+int ptwr_debug = 0x0;
+#define PTWR_PRINTK(_f, _a...) \
+ do { if ( unlikely(ptwr_debug) ) printk( _f , ## _a ); } while ( 0 )
+#define PTWR_PRINT_WHICH (which ? 'I' : 'A')
+#else
+#define PTWR_PRINTK(_f, _a...) ((void)0)
+#endif
+
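+/*
+ * Lifecycle sketch: a guest write to a read-only L1 p.t. page faults into
+ * ptwr_do_page_fault(), which disconnects the page (if ACTIVE), snapshots
+ * its contents, and temporarily gives the guest a writable mapping. The
+ * next flush point (e.g. cleanup_writable_pagetable()) calls ptwr_flush(),
+ * which re-protects the page and revalidates only the PTEs that changed
+ * relative to the snapshot.
+ */
+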
+/* Flush the given writable p.t. page and write-protect it again. */
+void ptwr_flush(const int which)
+{
+ unsigned long sstat, spte, pte, *ptep, l1va;
+ l1_pgentry_t *sl1e = NULL, *pl1e, ol1e, nl1e;
+ l2_pgentry_t *pl2e;
+ int i, cpu = smp_processor_id();
+ struct exec_domain *ed = current;
+ struct domain *d = ed->domain;
+
+ l1va = ptwr_info[cpu].ptinfo[which].l1va;
+ ptep = (unsigned long *)&linear_pg_table[l1va>>PAGE_SHIFT];
+
+ /*
+ * STEP 1. Write-protect the p.t. page so no more updates can occur.
+ */
+
+ if ( unlikely(__get_user(pte, ptep)) )
+ {
+ MEM_LOG("ptwr: Could not read pte at %p\n", ptep);
+ /*
+ * Really a bug. We could read this PTE during the initial fault,
+ * and pagetables can't have changed meantime. XXX Multi-CPU guests?
+ */
+ BUG();
+ }
+ PTWR_PRINTK("[%c] disconnected_l1va at %p is %p\n",
+ PTWR_PRINT_WHICH, ptep, pte);
+ pte &= ~_PAGE_RW;
+
+ if ( unlikely(shadow_mode(d)) )
+ {
+ /* Write-protect the p.t. page in the shadow page table. */
+ l1pte_propagate_from_guest(d, &pte, &spte);
+ __put_user(
+ spte, (unsigned long *)&shadow_linear_pg_table[l1va>>PAGE_SHIFT]);
+
+ /* Is the p.t. page itself shadowed? Map it into Xen space if so. */
+ sstat = get_shadow_status(d, pte >> PAGE_SHIFT);
+ if ( sstat & PSH_shadowed )
+ sl1e = map_domain_mem((sstat & PSH_pfn_mask) << PAGE_SHIFT);
+ }
+
+ /* Write-protect the p.t. page in the guest page table. */
+ if ( unlikely(__put_user(pte, ptep)) )
+ {
+ MEM_LOG("ptwr: Could not update pte at %p\n", ptep);
+ /*
+ * Really a bug. We could write this PTE during the initial fault,
+ * and pagetables can't have changed meantime. XXX Multi-CPU guests?
+ */
+ BUG();
+ }
+
+ /* Ensure that there are no stale writable mappings in any TLB. */
+ /* NB. INVLPG is a serialising instruction: flushes pending updates. */
+#if 1
+ __flush_tlb_one(l1va); /* XXX Multi-CPU guests? */
+#else
+ flush_tlb_all();
+#endif
+ PTWR_PRINTK("[%c] disconnected_l1va at %p now %p\n",
+ PTWR_PRINT_WHICH, ptep, pte);
+
+ /*
+ * STEP 2. Validate any modified PTEs.
+ */
+
+ pl1e = ptwr_info[cpu].ptinfo[which].pl1e;
+ for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ )
+ {
+ ol1e = ptwr_info[cpu].ptinfo[which].page[i];
+ nl1e = pl1e[i];
+
+ if ( likely(l1_pgentry_val(ol1e) == l1_pgentry_val(nl1e)) )
+ continue;
+
+ /*
+ * Fast path for PTEs that have merely been write-protected
+ * (e.g., during a Unix fork()). A strict reduction in privilege.
+ */
+ if ( likely(l1_pgentry_val(ol1e) == (l1_pgentry_val(nl1e)|_PAGE_RW)) )
+ {
+ if ( likely(l1_pgentry_val(nl1e) & _PAGE_PRESENT) )
+ {
+ if ( unlikely(sl1e != NULL) )
+ l1pte_propagate_from_guest(
+ d, &l1_pgentry_val(nl1e),
+ &l1_pgentry_val(sl1e[i]));
+ put_page_type(&frame_table[l1_pgentry_to_pfn(nl1e)]);
+ }
+ continue;
+ }
+
+ if ( unlikely(!get_page_from_l1e(nl1e, d)) )
+ {
+ MEM_LOG("ptwr: Could not re-validate l1 page\n");
+ /*
+ * Make the remaining p.t's consistent before crashing, so the
+ * reference counts are correct.
+ */
+ memcpy(&pl1e[i], &ptwr_info[cpu].ptinfo[which].page[i],
+ (ENTRIES_PER_L1_PAGETABLE - i) * sizeof(l1_pgentry_t));
+ unmap_domain_mem(pl1e);
+ ptwr_info[cpu].ptinfo[which].l1va = 0;
+ UNLOCK_BIGLOCK(d);
+ domain_crash();
+ }
+
+ if ( unlikely(sl1e != NULL) )
+ l1pte_propagate_from_guest(
+ d, &l1_pgentry_val(nl1e), &l1_pgentry_val(sl1e[i]));
+
+ if ( unlikely(l1_pgentry_val(ol1e) & _PAGE_PRESENT) )
+ put_page_from_l1e(ol1e, d);
+ }
+ unmap_domain_mem(pl1e);
+
+ /*
+ * STEP 3. Reattach the L1 p.t. page into the current address space.
+ */
+
+ if ( (which == PTWR_PT_ACTIVE) && likely(!shadow_mode(d)) )
+ {
+ pl2e = &linear_l2_table[ptwr_info[cpu].ptinfo[which].l2_idx];
+ *pl2e = mk_l2_pgentry(l2_pgentry_val(*pl2e) | _PAGE_PRESENT);
+ }
+
+ /*
+ * STEP 4. Final tidy-up.
+ */
+
+ ptwr_info[cpu].ptinfo[which].l1va = 0;
+
+ if ( unlikely(sl1e != NULL) )
+ {
+ unmap_domain_mem(sl1e);
+ put_shadow_status(d);
+ }
+}
+
+/* Write page fault handler: check if guest is trying to modify a PTE. */
+int ptwr_do_page_fault(unsigned long addr)
+{
+ unsigned long pte, pfn, l2e;
+ struct pfn_info *page;
+ l2_pgentry_t *pl2e;
+ int which, cpu = smp_processor_id();
+ u32 l2_idx;
+
+#ifdef __x86_64__
+ return 0; /* Writable pagetables need fixing for x86_64. */
+#endif
+
+ /*
+ * Attempt to read the PTE that maps the VA being accessed. By checking for
+ * PDE validity in the L2 we avoid many expensive fixups in __get_user().
+ */
+ if ( !(l2_pgentry_val(linear_l2_table[addr>>L2_PAGETABLE_SHIFT]) &
+ _PAGE_PRESENT) ||
+ __get_user(pte, (unsigned long *)&linear_pg_table[addr>>PAGE_SHIFT]) )
+ {
+ return 0;
+ }
+
+ pfn = pte >> PAGE_SHIFT;
+ page = &frame_table[pfn];
+
+ /* We are looking only for read-only mappings of p.t. pages. */
+ if ( ((pte & (_PAGE_RW | _PAGE_PRESENT)) != _PAGE_PRESENT) ||
+ ((page->u.inuse.type_info & PGT_type_mask) != PGT_l1_page_table) )
+ {
+ return 0;
+ }
+
+ /* Get the L2 index at which this L1 p.t. is always mapped. */
+ l2_idx = page->u.inuse.type_info & PGT_va_mask;
+ if ( unlikely(l2_idx >= PGT_va_unknown) )
+ {
+ domain_crash(); /* Urk! This L1 is mapped in multiple L2 slots! */
+ }
+ l2_idx >>= PGT_va_shift;
+
+ if ( l2_idx == (addr >> L2_PAGETABLE_SHIFT) )
+ {
+ MEM_LOG("PTWR failure! Pagetable maps itself at %p\n", addr);
+ domain_crash();
+ }
+
+ /*
+ * Is the L1 p.t. mapped into the current address space? If so we call it
+ * an ACTIVE p.t., otherwise it is INACTIVE.
+ */
+ pl2e = &linear_l2_table[l2_idx];
+ l2e = l2_pgentry_val(*pl2e);
+ which = PTWR_PT_INACTIVE;
+ if ( (l2e >> PAGE_SHIFT) == pfn )
+ {
+ /* Check the PRESENT bit to set ACTIVE. */
+ if ( likely(l2e & _PAGE_PRESENT) )
+ which = PTWR_PT_ACTIVE;
+ else {
+ /*
+ * If the PRESENT bit is clear, we may be conflicting with
+ * the current ACTIVE p.t. (it may be the same p.t. mapped
+ * at another virt addr).
+ * The ptwr_flush call below will restore the PRESENT bit.
+ */
+ if ( ptwr_info[cpu].ptinfo[PTWR_PT_ACTIVE].l1va &&
+ l2_idx == ptwr_info[cpu].ptinfo[PTWR_PT_ACTIVE].l2_idx )
+ which = PTWR_PT_ACTIVE;
+ }
+ }
+
+ PTWR_PRINTK("[%c] page_fault on l1 pt at va %p, pt for %08x, "
+ "pfn %p\n", PTWR_PRINT_WHICH,
+ addr, l2_idx << L2_PAGETABLE_SHIFT, pfn);
+
+ /*
+     * We only allow one ACTIVE and one INACTIVE p.t. to be updated at a
+     * time. If there is already one, we must flush it out.
+ */
+ if ( ptwr_info[cpu].ptinfo[which].l1va )
+ ptwr_flush(which);
+
+ ptwr_info[cpu].ptinfo[which].l1va = addr | 1;
+ ptwr_info[cpu].ptinfo[which].l2_idx = l2_idx;
+
+ /* For safety, disconnect the L1 p.t. page from current space. */
+ if ( (which == PTWR_PT_ACTIVE) &&
+ likely(!shadow_mode(current->domain)) )
+ {
+ *pl2e = mk_l2_pgentry(l2e & ~_PAGE_PRESENT);
+#if 1
+ flush_tlb(); /* XXX Multi-CPU guests? */
+#else
+ flush_tlb_all();
+#endif
+ }
+
+ /* Temporarily map the L1 page, and make a copy of it. */
+ ptwr_info[cpu].ptinfo[which].pl1e = map_domain_mem(pfn << PAGE_SHIFT);
+ memcpy(ptwr_info[cpu].ptinfo[which].page,
+ ptwr_info[cpu].ptinfo[which].pl1e,
+ ENTRIES_PER_L1_PAGETABLE * sizeof(l1_pgentry_t));
+
+ /* Finally, make the p.t. page writable by the guest OS. */
+ pte |= _PAGE_RW;
+ PTWR_PRINTK("[%c] update %p pte to %p\n", PTWR_PRINT_WHICH,
+ &linear_pg_table[addr>>PAGE_SHIFT], pte);
+ if ( unlikely(__put_user(pte, (unsigned long *)
+ &linear_pg_table[addr>>PAGE_SHIFT])) )
+ {
+ MEM_LOG("ptwr: Could not update pte at %p\n", (unsigned long *)
+ &linear_pg_table[addr>>PAGE_SHIFT]);
+ /* Toss the writable pagetable state and crash. */
+ unmap_domain_mem(ptwr_info[cpu].ptinfo[which].pl1e);
+ ptwr_info[cpu].ptinfo[which].l1va = 0;
+ domain_crash();
+ }
+
+ return EXCRET_fault_fixed;
+}
+
+static __init int ptwr_init(void)
+{
+ int i;
+
+ for ( i = 0; i < smp_num_cpus; i++ )
+ {
+ ptwr_info[i].ptinfo[PTWR_PT_ACTIVE].page =
+ (void *)alloc_xenheap_page();
+ ptwr_info[i].ptinfo[PTWR_PT_INACTIVE].page =
+ (void *)alloc_xenheap_page();
+ }
+
+ return 0;
+}
+__initcall(ptwr_init);
+
+
+
+
+/************************************************************************/
+/************************************************************************/
+/************************************************************************/
+
+#ifndef NDEBUG
+
+void ptwr_status(void)
+{
+ unsigned long pte, *ptep, pfn;
+ struct pfn_info *page;
+ int cpu = smp_processor_id();
+
+ ptep = (unsigned long *)&linear_pg_table
+ [ptwr_info[cpu].ptinfo[PTWR_PT_INACTIVE].l1va>>PAGE_SHIFT];
+
+ if ( __get_user(pte, ptep) ) {
+ MEM_LOG("ptwr: Could not read pte at %p\n", ptep);
+ domain_crash();
+ }
+
+ pfn = pte >> PAGE_SHIFT;
+ page = &frame_table[pfn];
+ printk("need to alloc l1 page %p\n", page);
+ /* make pt page writable */
+ printk("need to make read-only l1-page at %p is %p\n",
+ ptep, pte);
+
+ if ( ptwr_info[cpu].ptinfo[PTWR_PT_ACTIVE].l1va == 0 )
+ return;
+
+ if ( __get_user(pte, (unsigned long *)
+ ptwr_info[cpu].ptinfo[PTWR_PT_ACTIVE].l1va) )
+ {
+ MEM_LOG("ptwr: Could not read pte at %p\n", (unsigned long *)
+ ptwr_info[cpu].ptinfo[PTWR_PT_ACTIVE].l1va);
+ domain_crash();
+ }
+ pfn = pte >> PAGE_SHIFT;
+ page = &frame_table[pfn];
+}
+
+void audit_domain(struct domain *d)
+{
+ int ttot=0, ctot=0, io_mappings=0, lowmem_mappings=0;
+
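+ /*
+ * Nested helper (a GNU C extension): bias a page's general refcount,
+ * and optionally its type count, by dir (+1/-1), complaining if either
+ * underflows. The audit drives the counts down in phase 1 and restores
+ * them in phase 3.
+ */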
+ void adjust (struct pfn_info *page, int dir, int adjtype)
+ {
+ int count = page->count_info & PGC_count_mask;
+
+ if ( adjtype )
+ {
+ int tcount = page->u.inuse.type_info & PGT_count_mask;
+
+ ttot++;
+
+ tcount += dir;
+
+ if ( tcount < 0 )
+ {
+ /* This will only come out once. */
+ printk("Audit %d: type count whent below zero pfn=%x "
+ "taf=%x otaf=%x\n",
+ d->id, page-frame_table,
+ page->u.inuse.type_info,
+ page->tlbflush_timestamp);
+ }
+
+ page->u.inuse.type_info =
+ (page->u.inuse.type_info & ~PGT_count_mask) |
+ (tcount & PGT_count_mask);
+ }
+
+ ctot++;
+ count += dir;
+ if ( count < 0 )
+ {
+ /* This will only come out once. */
+ printk("Audit %d: general count whent below zero pfn=%x "
+ "taf=%x otaf=%x\n",
+ d->id, page-frame_table,
+ page->u.inuse.type_info,
+ page->tlbflush_timestamp);
+ }
+
+ page->count_info =
+ (page->count_info & ~PGC_count_mask) |
+ (count & PGC_count_mask);
+
+ }
+
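+ /* Walk every page of d, reporting any pagetable entry that maps xpfn. */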
+ void scan_for_pfn(struct domain *d, unsigned long xpfn)
+ {
+ unsigned long pfn, *pt;
+ struct list_head *list_ent;
+ struct pfn_info *page;
+ int i;
+
+ list_ent = d->page_list.next;
+ for ( i = 0; (list_ent != &d->page_list); i++ )
+ {
+ pfn = list_entry(list_ent, struct pfn_info, list) - frame_table;
+ page = &frame_table[pfn];
+
+ switch ( page->u.inuse.type_info & PGT_type_mask )
+ {
+ case PGT_l1_page_table:
+ case PGT_l2_page_table:
+ pt = map_domain_mem(pfn<<PAGE_SHIFT);
+ for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ )
+ if ( (pt[i] & _PAGE_PRESENT) &&
+ ((pt[i] >> PAGE_SHIFT) == xpfn) )
+ printk(" found dom=%d i=%x pfn=%lx t=%x c=%x\n",
+ d->id, i, pfn, page->u.inuse.type_info,
+ page->count_info);
+ unmap_domain_mem(pt);
+ }
+
+ list_ent = frame_table[pfn].list.next;
+ }
+
+ }
+
+ void scan_for_pfn_remote(unsigned long xpfn)
+ {
+ struct domain *e;
+ for_each_domain ( e )
+ scan_for_pfn( e, xpfn );
+ }
+
-
- adjust(&frame_table[pagetable_val(d->exec_domain[0]->arch.pagetable)>>PAGE_SHIFT], -1, 1);
++ int i, j, l1, l2; /* j: inner PTE index in phase 3, so the page count in i survives */
+ unsigned long pfn;
+ struct list_head *list_ent;
+ struct pfn_info *page;
+
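+ /*
+ * Quiesce the target: pause it (if remote) and synchronise pagetables
+ * on all CPUs so the audit sees a stable view of the refcounts.
+ */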
+ if ( d != current->domain )
+ domain_pause(d);
+ synchronise_pagetables(~0UL);
+
+ printk("pt base=%lx sh_info=%x\n",
+ pagetable_val(d->exec_domain[0]->arch.pagetable)>>PAGE_SHIFT,
+ virt_to_page(d->shared_info)-frame_table);
+
+ spin_lock(&d->page_alloc_lock);
+
+ /* PHASE 0: Check ownership and count sanity; stash original type_info. */
+
+ list_ent = d->page_list.next;
+ for ( i = 0; (list_ent != &d->page_list); i++ )
+ {
+ pfn = list_entry(list_ent, struct pfn_info, list) - frame_table;
+ page = &frame_table[pfn];
+
+ if ( page_get_owner(page) != d )
+ BUG();
+
+ if ( (page->u.inuse.type_info & PGT_count_mask) >
+ (page->count_info & PGC_count_mask) )
+ printk("taf > caf %x %x pfn=%lx\n",
+ page->u.inuse.type_info, page->count_info, pfn );
+
+#if 0 /* SYSV shared memory pages plus writeable files. */
+ if ( (page->u.inuse.type_info & PGT_type_mask) == PGT_writable_page &&
+ (page->u.inuse.type_info & PGT_count_mask) > 1 )
+ {
+ printk("writeable page with type count >1: pfn=%lx t=%x c=%x\n",
+ pfn,
+ page->u.inuse.type_info,
+ page->count_info );
+ scan_for_pfn_remote(pfn);
+ }
+#endif
+ if ( (page->u.inuse.type_info & PGT_type_mask) == PGT_none &&
+ (page->u.inuse.type_info & PGT_count_mask) > 1 )
+ {
+ printk("normal page with type count >1: pfn=%lx t=%x c=%x\n",
+ pfn,
+ page->u.inuse.type_info,
+ page->count_info );
+ }
+
+ /* Use tlbflush_timestamp to store original type_info. */
+ page->tlbflush_timestamp = page->u.inuse.type_info;
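+ /* (Safe: the domain is quiesced, and the field is zeroed in phase 3.) */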
+
+ list_ent = frame_table[pfn].list.next;
+ }
+
+
+ /* PHASE 1: Drop the refs implied by the base table and every pt entry. */
- adjust(&frame_table[pagetable_val(
- d->exec_domain[0]->arch.pagetable)>>PAGE_SHIFT], 1, 1);
++ if ( pagetable_val(d->exec_domain[0]->arch.pagetable) )
++ adjust(&frame_table[pagetable_val(
++ d->exec_domain[0]->arch.pagetable)>>PAGE_SHIFT], -1, 1);
+
+ list_ent = d->page_list.next;
+ for ( i = 0; (list_ent != &d->page_list); i++ )
+ {
+ unsigned long *pt;
+ pfn = list_entry(list_ent, struct pfn_info, list) - frame_table;
+ page = &frame_table[pfn];
+
+ if ( page_get_owner(page) != d )
+ BUG();
+
+ switch ( page->u.inuse.type_info & PGT_type_mask )
+ {
+ case PGT_l2_page_table:
+
+ if ( (page->u.inuse.type_info & PGT_validated) != PGT_validated )
+ printk("Audit %d: L2 not validated %x\n",
+ d->id, page->u.inuse.type_info);
+
+ if ( (page->u.inuse.type_info & PGT_pinned) != PGT_pinned )
+ printk("Audit %d: L2 not pinned %x\n",
+ d->id, page->u.inuse.type_info);
+ else
+ adjust( page, -1, 1 );
+
+ pt = map_domain_mem( pfn<<PAGE_SHIFT );
+
+ for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ )
+ {
+ if ( pt[i] & _PAGE_PRESENT )
+ {
+ unsigned long l1pfn = pt[i]>>PAGE_SHIFT;
+ struct pfn_info *l1page = &frame_table[l1pfn];
+
+ if ( page_get_owner(l1page) != d )
+ {
+ printk("L2: Skip bizarre page belonging to other "
+ "dom %p\n", page_get_owner(l1page));
+ continue;
+ }
+
+ if ( (l1page->u.inuse.type_info & PGT_type_mask) ==
+ PGT_l2_page_table )
+ printk("Audit %d: [%x] Found %s Linear PT "
+ "t=%x pfn=%lx\n", d->id, i,
+ (l1pfn==pfn) ? "Self" : "Other",
+ l1page->u.inuse.type_info,
+ l1pfn);
+ else if ( (l1page->u.inuse.type_info & PGT_type_mask) !=
+ PGT_l1_page_table )
+ printk("Audit %d: [%x] Expected L1 t=%x pfn=%lx\n",
+ d->id, i,
+ l1page->u.inuse.type_info,
+ l1pfn);
+
+ adjust(l1page, -1, 1);
+ }
+ }
+
+ unmap_domain_mem(pt);
+
+ break;
+
+
+ case PGT_l1_page_table:
+
+ if ( (page->u.inuse.type_info & PGT_pinned) == PGT_pinned )
+ adjust( page, -1, 1 );
+
+ if ( (page->u.inuse.type_info & PGT_validated) != PGT_validated )
+ printk("Audit %d: L1 not validated %x\n",
+ d->id, page->u.inuse.type_info);
+#if 0
+ if ( (page->u.inuse.type_info & PGT_pinned) != PGT_pinned )
+ printk("Audit %d: L1 not pinned %x\n",
+ d->id, page->u.inuse.type_info);
+#endif
+ pt = map_domain_mem( pfn<<PAGE_SHIFT );
+
+ for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ )
+ {
+ if ( pt[i] & _PAGE_PRESENT )
+ {
+ unsigned long l1pfn = pt[i]>>PAGE_SHIFT;
+ struct pfn_info *l1page = &frame_table[l1pfn];
+
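+ /* Mappings below 1MB and beyond the end of RAM are tallied, not audited. */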
+ if ( l1pfn < 0x100 )
+ {
+ lowmem_mappings++;
+ continue;
+ }
+
+ if ( l1pfn > max_page )
+ {
+ io_mappings++;
+ continue;
+ }
+
+ if ( pt[i] & _PAGE_RW )
+ {
+
+ if ( (l1page->u.inuse.type_info & PGT_type_mask) ==
+ PGT_l1_page_table ||
+ (l1page->u.inuse.type_info & PGT_type_mask) ==
+ PGT_l2_page_table )
+ printk("Audit %d: [%x] Ilegal RW t=%x pfn=%lx\n",
+ d->id, i,
+ l1page->u.inuse.type_info,
+ l1pfn);
+
+ }
+
+ if ( page_get_owner(l1page) != d )
+ {
+ printk("Audit %d: [%lx,%x] Skip foreign page dom=%p "
+ "pfn=%lx c=%08x t=%08x m2p=%lx\n",
+ d->id, pfn, i,
+ page_get_owner(l1page),
+ l1pfn,
+ l1page->count_info,
+ l1page->u.inuse.type_info,
+ machine_to_phys_mapping[l1pfn]);
+ continue;
+ }
+
+ adjust(l1page, -1, 0);
+ }
+ }
+
+ unmap_domain_mem(pt);
+
+ break;
+ }
+
+ list_ent = frame_table[pfn].list.next;
+ }
+
+ if ( (io_mappings > 0) || (lowmem_mappings > 0) )
+ printk("Audit %d: Found %d lowmem mappings and %d io mappings\n",
+ d->id, lowmem_mappings, io_mappings);
+
+ /* PHASE 2: Residual counts should now be 0 (type) and 1 (general). */
+
+ ctot = ttot = 0;
+ list_ent = d->page_list.next;
+ for ( i = 0; (list_ent != &d->page_list); i++ )
+ {
+ pfn = list_entry(list_ent, struct pfn_info, list) - frame_table;
+ page = &frame_table[pfn];
+
+ switch ( page->u.inuse.type_info & PGT_type_mask)
+ {
+ case PGT_l1_page_table:
+ case PGT_l2_page_table:
+ if ( (page->u.inuse.type_info & PGT_count_mask) != 0 )
+ {
+ printk("Audit %d: type count!=0 t=%x ot=%x c=%x pfn=%lx\n",
+ d->id, page->u.inuse.type_info,
+ page->tlbflush_timestamp,
+ page->count_info, pfn );
+ scan_for_pfn_remote(pfn);
+ }
+ default:
+ if ( (page->count_info & PGC_count_mask) != 1 )
+ {
+ printk("Audit %d: gen count!=1 (c=%x) t=%x ot=%x pfn=%lx\n",
+ d->id,
+ page->count_info,
+ page->u.inuse.type_info,
+ page->tlbflush_timestamp, pfn );
+ scan_for_pfn_remote(pfn);
+ }
+ break;
+ }
+
+ list_ent = frame_table[pfn].list.next;
+ }
+
+ /* PHASE 3: Restore the refs dropped in phase 1; clear saved timestamps. */
+ list_ent = d->page_list.next;
++ l1 = l2 = 0;
+ for ( i = 0; (list_ent != &d->page_list); i++ )
+ {
+ unsigned long *pt;
+ pfn = list_entry(list_ent, struct pfn_info, list) - frame_table;
+ page = &frame_table[pfn];
+
+ switch ( page->u.inuse.type_info & PGT_type_mask )
+ {
+ case PGT_l2_page_table:
++ l2++;
+ if ( (page->u.inuse.type_info & PGT_pinned) == PGT_pinned )
+ adjust( page, 1, 1 );
+
+ pt = map_domain_mem( pfn<<PAGE_SHIFT );
+
+ for ( j = 0; j < DOMAIN_ENTRIES_PER_L2_PAGETABLE; j++ )
+ {
+ if ( pt[j] & _PAGE_PRESENT )
+ {
+ unsigned long l1pfn = pt[j]>>PAGE_SHIFT;
+ struct pfn_info *l1page;
+
+ if ( l1pfn > max_page )
+ continue;
+
+ l1page = &frame_table[l1pfn];
+
+ if ( page_get_owner(l1page) == d )
+ adjust(l1page, 1, 1);
+ }
+ }
+
+ unmap_domain_mem(pt);
+ break;
+
+ case PGT_l1_page_table:
++ l1++;
+ if ( (page->u.inuse.type_info & PGT_pinned) == PGT_pinned )
+ adjust( page, 1, 1 );
+
+ pt = map_domain_mem( pfn<<PAGE_SHIFT );
+
+ for ( j = 0; j < ENTRIES_PER_L1_PAGETABLE; j++ )
+ {
+ if ( pt[j] & _PAGE_PRESENT )
+ {
+ unsigned long l1pfn = pt[j]>>PAGE_SHIFT;
+ struct pfn_info *l1page;
+
+ if ( l1pfn > max_page )
+ continue;
+
+ l1page = &frame_table[l1pfn];
+
+ if ( (page_get_owner(l1page) != d) || (l1pfn < 0x100) )
+ continue;
+
+ adjust(l1page, 1, 0);
+ }
+ }
+
+ unmap_domain_mem(pt);
+ break;
+ }
+
+
+ page->tlbflush_timestamp = 0;
+
+ list_ent = frame_table[pfn].list.next;
+ }
+
+ spin_unlock(&d->page_alloc_lock);
+
- printk("Audit %d: Done. ctot=%d ttot=%d\n", d->id, ctot, ttot );
++ if ( pagetable_val(d->exec_domain[0]->arch.pagetable) )
++ adjust(&frame_table[pagetable_val(
++ d->exec_domain[0]->arch.pagetable)>>PAGE_SHIFT], 1, 1);
+
++ printk("Audit %d: Done. pages=%d l1=%d l2=%d ctot=%d ttot=%d\n", d->id, i, l1, l2, ctot, ttot );
+
+ if ( d != current->domain )
+ domain_unpause(d);
+}
+
+void audit_domains(void)
+{
+ struct domain *d;
+ for_each_domain ( d )
+ audit_domain(d);
+}
+
+void audit_domains_key(unsigned char key)
+{
+ audit_domains();
+}
+
+#endif